from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import scipy.stats
Let's load that in now.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=["s_length", "s_width", "p_length", "p_width"])
df.head()
| | s_length | s_width | p_length | p_width |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
df.plot(x="p_length", y="p_width", kind="scatter",
title="Petal Length v Petal Width")
We can clearly see a relationship between these two variables, so this should serve us well as test data when looking at correlation and regression.
$\Large{r = \frac{1}{n-1} \sum_{i=1}^{n}(\frac{x_i - \overline{x} }{s_x})(\frac{y_i - \overline{y}}{s_y})}$
def mean(nums):
    return sum(nums) / len(nums)

def dev(nums):
    # sample standard deviation (n-1 in the denominator)
    m = mean(nums)
    return math.sqrt(sum((n - m) ** 2 for n in nums) / (len(nums) - 1))
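As a quick sanity check (an addition here, not part of the original walkthrough), these helpers should agree with NumPy; note that np.std defaults to the population formula, so ddof=1 is needed to match our sample standard deviation:

xs = list(df.p_length)
# our helpers should match NumPy's mean and sample std dev
assert math.isclose(mean(xs), np.mean(xs))
assert math.isclose(dev(xs), np.std(xs, ddof=1))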
def correlation(x_data: list, y_data: list) -> float:
    # first get the means
    xbar = mean(x_data)
    ybar = mean(y_data)
    # and the sample standard deviations
    xdev = dev(x_data)
    ydev = dev(y_data)
    # standardize each point (subtract the mean, divide by the
    # deviation) and multiply the pairs together
    pairs = list(zip(x_data, y_data))
    sum_these = [((x - xbar) / xdev) * ((y - ybar) / ydev) for x, y in pairs]
    return sum(sum_these) / (len(x_data) - 1)
print(correlation(
    x_data=df.p_length,
    y_data=df.p_width)
)
print(scipy.stats.pearsonr(df.p_length, df.p_width)[0])
0.962865431402796
0.9628654314027963
Now that we've sufficiently reinvented the wheel, we can move on.
From this r coefficient of 0.963, we can see that petal width and petal length have a strong positive correlation.
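In practice, pandas can compute this directly: DataFrame.corr (which defaults to Pearson's r) returns every pairwise correlation in one call.

# Pearson correlation for every pair of columns; the
# p_length/p_width entry should match our hand-rolled value
df.corr()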
$\Large \hat{y} = b_0 + b_1x$
where
$\Large b_1 = r\frac{s_y}{s_x} $
and
$\Large b_0 = \overline{y} - b_1\overline{x} $
class LeastSquaresLine:
    """Object for least squares return"""

    def __init__(self):
        self.intercept = None
        self.slope = None

    def __repr__(self):
        # negative intercepts carry their own minus sign
        sign = "+" if self.intercept > 0 else ""
        return f"y={round(self.slope, 3)}x{sign}{round(self.intercept, 3)}"
Now that we have a nice, compact, printable object, we can create our function that uses the formulas above to calculate the equation of the line.
def least_squares(x_data: list, y_data: list) -> LeastSquaresLine:
    # calculate slope and intercept from the formulas above
    lsl = LeastSquaresLine()
    lsl.slope = correlation(x_data, y_data) * (dev(y_data) / dev(x_data))
    lsl.intercept = mean(y_data) - lsl.slope * mean(x_data)
    return lsl
Let's call our function to get the values for the regression line.
lsl = least_squares(df.p_length, df.p_width)
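As with the correlation, we can cross-check against SciPy (this check is an addition, not part of the original walkthrough); scipy.stats.linregress computes the same least-squares slope and intercept:

# compare our slope and intercept to scipy's least-squares fit
result = scipy.stats.linregress(df.p_length, df.p_width)
print(round(lsl.slope, 3), round(lsl.intercept, 3))
print(round(result.slope, 3), round(result.intercept, 3))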
And now we plot our values and the line to see how we did.
df.plot(x="p_length", y="p_width", kind="scatter",
title="Petal Length v Petal Width")
plt.pyplot.plot(
df.p_length, lsl.slope*df.p_length+lsl.intercept,
color="red", label=lsl # Label is the repr of the return
)
plt.pyplot.legend()
The least-squares line we derived from the formulas fits the data closely.
For now, this is where we will end. Keep in mind that using libraries for this is much more practical and far less time-consuming. For reviewing the formulas and concepts, and even just for general programming practice, however, writing your own implementations has its benefits.
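For reference, here is a minimal sketch of that library route with scikit-learn, assuming the same df as above (LinearRegression expects a 2-D feature array, hence the double brackets):

from sklearn.linear_model import LinearRegression

# fit petal width on petal length; coef_ and intercept_ should
# match the slope and intercept we derived by hand
model = LinearRegression().fit(df[["p_length"]], df.p_width)
print(model.coef_[0], model.intercept_)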